library(DESeq2)
library(pheatmap)
library(dplyr)
library(dendextend)
library(ggplot2)
#Daniela File path
design_matrix<-read.table('/Users/danielaquijano/Documents/GitHub/Transcriptomics-Final-Project-/source_files/Experimental_Design_TG (1).csv',sep=',',header=TRUE)
head(design_matrix)
#Tasnim File path
#design_matrix<-read.table('/Users/tasnimtabassum/Documents/Transcriptomics_SP22/Experimental_Design_TG.csv',sep=',',header=TRUE)
#head(design_matrix)
rownames(design_matrix)<-design_matrix$Sample
design_matrix$Sample<-NULL
design_matrix
counts_matrix<-read.table("/Users/danielaquijano/Documents/GitHub/Transcriptomics-Final-Project-/Count_Tables/allcounts.csv",sep=',',header=TRUE)
counts_matrix
Because the numbers after the dot in the ensembl IDs represent versions of genes in certain annotations, we can remove these to more easily conduct our differential gene expression analysis.
counts_matrix$V1<-gsub("\\..*","",counts_matrix$V1)
counts_matrix
# remove the "V1" from col 1
rownames(counts_matrix)<-counts_matrix$V1
counts_matrix$V1<-NULL
#head(counts_matrix)
counts_matrix<-counts_matrix[,order(colnames(counts_matrix))]
counts_matrix
design_matrix<-design_matrix[order(rownames(design_matrix)),]
design_matrix
NA
design_matrix$Age = factor(design_matrix$Age)
design_matrix$Age
[1] eight eight two two eight eight two two two eight eight eight
[13] two two eight eight two two two two eight eight eight eight
[25] two two six six six six twelve twelve twelve twelve twelve twelve
[37] twelve twelve six six twelve twelve six six twelve twelve six six
[49] six six twelve twelve six six twelve twelve six six twelve twelve
[61] twelve twelve twelve twelve six six six six two two eight eight
Levels: eight six twelve two
dds <- DESeqDataSetFromMatrix(countData = counts_matrix,
colData = design_matrix,
design = ~ Age+Genotype)
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
dds
class: DESeqDataSet
dim: 46075 72
metadata(1): version
assays(1): counts
rownames(46075): ENSMUSG00000000001 ENSMUSG00000000003 ... N_noFeature N_unmapped
rowData names(0):
colnames(72): SRR8512301 SRR8512302 ... SRR8512439 SRR8512440
colData names(3): Model Genotype Age
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 13457 genes
-- DESeq argument 'minReplicatesForReplace' = 7
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
normalizedcounts.matrix <- counts(dds,normalized=T)
vst_dds <- vst(dds)
dists <- dist(t(assay(vst_dds)))
head(vst_dds)
class: DESeqTransform
dim: 6 72
metadata(1): version
assays(1): ''
rownames(6): ENSMUSG00000000001 ENSMUSG00000000028 ... ENSMUSG00000000049
ENSMUSG00000000056
rowData names(43): baseMean baseVar ... replace dispFit
colnames(72): SRR8512301 SRR8512302 ... SRR8512439 SRR8512440
colData names(5): Model Genotype Age sizeFactor replaceable
PCA_Genotype<-plotPCA(vst_dds,intgroup=c("Age"))+labs(title = "PCA of mice of different ages", color = "Group")+coord_fixed(ratio=3)
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
PCA_Genotype
resultsNames(dds)
[1] "Intercept" "Age_six_vs_eight" "Age_twelve_vs_eight"
[4] "Age_two_vs_eight" "Genotype_rtg4510_vs_J20" "Genotype_WT_vs_J20"
[7] "Genotype_WT_TG_vs_J20"
dds$Age
[1] eight eight two two eight eight two two two eight eight eight
[13] two two eight eight two two two two eight eight eight eight
[25] two two six six six six twelve twelve twelve twelve twelve twelve
[37] twelve twelve six six twelve twelve six six twelve twelve six six
[49] six six twelve twelve six six twelve twelve six six twelve twelve
[61] twelve twelve twelve twelve six six six six two two eight eight
Levels: eight six twelve two
#Compare 2 vs 8
res_1 <- results(dds, contrast = c("Age", "two", "eight"))
res1_ordered <- res_1[order(res_1$padj),]
head(res1_ordered)
log2 fold change (MLE): Age two vs eight
Wald test p-value: Age two vs eight
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENSMUSG00000024501 1037.7531 0.574615 0.0533228 10.77617 4.46085e-27 8.33420e-23
ENSMUSG00000030577 10.9689 -4.378147 0.4088867 -10.70748 9.38804e-27 8.76984e-23
ENSMUSG00000026303 45.6887 -1.152385 0.1270487 -9.07042 1.18562e-19 7.38363e-16
ENSMUSG00000030124 162.9878 -1.526436 0.1688798 -9.03859 1.58703e-19 7.41263e-16
ENSMUSG00000046805 863.4728 -1.317521 0.1476032 -8.92610 4.41290e-19 1.64892e-15
ENSMUSG00000029816 69.7214 -1.694819 0.1920511 -8.82483 1.09623e-18 3.41349e-15
plotDispEsts(dds)
Install Mouse annotation library:
library(biomaRt) #For conversion of transcript IDs to gene ID
library(annotables) #to retrieve grcm38 annotation for mouse genome
library(org.Mm.eg.db) #Mouse genome annotation
library(DOSE)
library(pathview)
library(clusterProfiler)
library(AnnotationHub)
library(ensembldb)
library(tidyverse)
library(ggnewscale)
# mouse genome load
grcm38
# check that ensgene in our df is prsent in the mouse genome df
idx <- grcm38$ensgene %in% rownames(res1_ordered)
# head(idx)
# df with all the ids that are in our df from the mouse genome df
ids <- grcm38[idx, ]
# head(ids)
# remove duplicates
non_duplicates <- which(duplicated(ids$ensgene) == FALSE)
ids <- ids[non_duplicates, ]
#nrow(res1_ordered)
#rownames(res1_ordered)
# entrezID contains only the IDs that are also in our df
ensgeneID= grcm38[grcm38$ensgene %in% rownames(res1_ordered), ]
# entrezID contains only the IDs that are also in our df
entrezID= grcm38[grcm38$ensgene %in% rownames(res1_ordered), ]
# check nrow ensgeneID
head(ensgeneID)
# create a vector of only the ensgeneIDs
ensgene_ID_vector = c(ensgeneID[[1]])
head(ensgene_ID_vector)
[1] "ENSMUSG00000000001" "ENSMUSG00000000028" "ENSMUSG00000000031" "ENSMUSG00000000037"
[5] "ENSMUSG00000000049" "ENSMUSG00000000056"
# create a vector of only the entrezIDs
entrez_ID_vector = c(entrezID[[2]])
# create a vector of only the gene symbols
gene_symbols = subset(grcm38$symbol, grcm38$ensgene %in% rownames(res1_ordered))
# create new df that contains only the ensgeneIDs, lfc and padj
res2= data.frame(log2foldchange= subset(res1_ordered$log2FoldChange, grcm38$ensgene %in% rownames(res1_ordered)))
padj = subset(res1_ordered$padj, grcm38$ensgene %in% rownames(res1_ordered))
res2 = cbind(padj, res2)
res2 = cbind(ensgene_ID_vector, res2)
res2 = cbind(entrez_ID_vector, res2)
res2 = cbind(gene_symbols, res2)
# omit all "na" values
res2 = na.omit(res2)
res2 <- res2[order(res2$padj),]
head(res2)
## Significant genes is a vector of fold changes where the names are ENTREZ gene IDs. The background set is a vector of all the genes represented on the platform.
# bg entrez contains all the ensgene
allOEgenes = as.character(res2$ensgene_ID_vector)
# sig res entrez contains all the entrezIDs that have padj <0.05
head(res2)
sigOE <- subset(res2, padj< 0.05)
head(sigOE)
# vector of only lfc values
sigOE_genes = as.character(sigOE$ensgene_ID_vector)
head(sigOE_genes)
[1] "ENSMUSG00000000001" "ENSMUSG00000000028" "ENSMUSG00000000037" "ENSMUSG00000000049"
[5] "ENSMUSG00000000056" "ENSMUSG00000000058"
## Run GO enrichment analysis
ego <- enrichGO(gene = sigOE_genes,
universe = allOEgenes,
keyType = "ENSEMBL",
OrgDb = org.Mm.eg.db,
ont = "ALL",
pAdjustMethod = "BH",
qvalueCutoff = 0.05,
readable = TRUE,
pool = TRUE)
## Output results from GO analysis to a table
cluster_summary <- data.frame(ego)
#gene_ratio = cluster_summary[order(cluster_summary$pvalue, decreasing = FALSE), ]
#head(gene_ratio)
ego
## Dotplot
dotplot(ego, showCategory=35)+theme(text = element_text(size = 1)) +scale_y_discrete(labels=function(x) str_wrap(x, width=40))+ggtitle('Enriched genes when comparing rtg4510 mice at 2 and 8 months')+ theme(plot.title = element_text(size=16))
Scale for 'y' is already present. Adding another scale for 'y', which will replace the
existing scale.
#barplot(ego, showCategory = 20)
options(ggrepel.max.overlaps = Inf)
## To color genes by log2 fold changes, we need to extract the log2 fold changes from our results table creating a named vector
OE_foldchanges <- sigOE$log2foldchange
names(OE_foldchanges) <- sigOE$gene_symbols
## Cnetplot details the genes associated with one or more terms - by default gives the top 1 significant term (by padj)
cnetplot(ego,
categorySize="pvalue",
showCategory = 5,
colorEdge = TRUE,
circular = FALSE,
node_label = "all",
cex_category = 1.5,
cex_gene = 0.75,
cex_label_category = 1.5,
cex_label_gene = 0.75,
shadowtext = "all")+ggtitle('Enriched Genes when comparing J20 mice at 6 and 12 months ')
NA
NA
NA
NA
library(enrichplot)
ego2 = pairwise_termsim(ego)
emapplot(ego2, showCategory = 20, colorEdge = TRUE)+ggtitle('Enriched Genes when comparing J20 mice at 6 and 12 months of age')
# Set-up
#BiocManager::install("SPIA")
library(SPIA)
## Significant genes is a vector of fold changes where the names are ENTREZ gene IDs. The background set is a vector of all the genes represented on the platform.
# bg entrez contains all the entrezIDs
background_entrez <- res2$entrez_ID_vector
# sig res entrez contains all the entrezIDs that have padj <0.05
sig_res_entrez <- res2[which(res2$padj < 0.05), ]
# vector of only lfc values
sig_entrez <- sig_res_entrez$log2foldchange
head(sig_entrez)
# adding entrezIDs as names for the sig entrez
names(sig_entrez) <- sig_res_entrez$entrez_ID_vector
head(sig_entrez)
# remove dups
dups<-unique(names(sig_entrez[which(duplicated(names(sig_entrez)))]))
sig_entrez<-sig_entrez[!(names(sig_entrez) %in% dups)]
#de= as.vector(sig_entrez)
#de = sort(de, decreasing = FALSE)
# this step takes time
spia_result <- spia(de=sig_entrez, all=background_entrez, organism="mmu", plots=FALSE)
write.csv(spia_result, file = "spia_result_J20_age.csv")
# view one record at a time
subset(spia_result, ID == "04727")
head(res1_ordered)
norm_counts_top_40 = normalizedcounts.matrix[row.names(head(res1_ordered, 40)), ]
nrow(norm_counts_top_40)
head(design_matrix)
annotation_columns<-design_matrix
row.names(annotation_columns) <- colnames(norm_counts_top_40)
library(pheatmap)
tiff("Heatmap_J20_age.tiff", width = 7, height = 5, units = 'in', res = 300)
pheatmap(norm_counts_top_40, color=colorRampPalette(c("white", "orchid3", "purple4"))(30), scale="row", cluster_cols = T, show_rownames = T,fontsize = 7,fontsize_row = 4, fontsize_col = 4,labels_row = rownames(dists),annotation_col =annotation_columns,main='Differentially Expressed Genes in J20 mice at 6 and 12 months old' )
dev.off()